December 3, 2020

Research question

Do progressives and conservatives read books of opposing views as often as they read books of their own views?

Main Dataset

  • Collected in late 2017 from Goodreads.com users' public shelves by UC San Diego.

  • Prior to filtering, 228,648,342 user-book interactions; 2,360,655 books; 876,145 users.

  • Each observation is a book the user has claimed to have read.

  • Once filtered for user-book interactions of users that have read at least one progressive book, the df_progressive_users dataframe contains 139,669,596 observations (books read).

  • Once filtered for user-book interactions of users that have read at least one conservative book, the df_conservative_users dataframe contains 93,377,104 observations (books read).

Supplementary Dataset

  • df_progressive_books dataframe contains the top 480 progressive books as voted for by Goodreads.com users.

  • df_conservative_books dataframe contains the top 480 conservative books as voted for by Goodreads.com users.

  • incorrectly_labeled_progressive dataframe contains books to be removed from df_progressive_books due to them not being clearly progressive (e.g. Hunger Games, 1984).

  • incorrectly_labeled_conservative dataframe contains books to be removed from df_conservative_books due to them not being clearly conservative (e.g. Hunger Games, 1984).

Histogram code: Single vs Multi-Type Users

prog_cons_or_both <- function(df) {
   df_users_labels <- tibble("user_id" = 1, "type" = 1)
   for(user in unique(df$user_id)) {
      user_temp <- df[df$user_id == user, ]
      df_prog <- filter(user_temp, (user_temp$book_id %in% df_progressive_books$book_id))
      df_cons <- filter(user_temp, (user_temp$book_id %in% df_conservative_books$book_id))
      if (length(df_prog$X1)) {
      df_users_labels <- df_users_labels %>% add_row("user_id" = user, "type" = 1)
      }
      if (length(df_cons$X1)) {
      df_users_labels <- df_users_labels %>% add_row("user_id" = user, "type" = 1)
      }
   }
  df_users_labels <- df_users_labels[-1,]
  df_agg <- aggregate(df_users_labels$type, by=list(User=df_users_labels$user_id), FUN=sum)
  df_agg <- arrange(df_agg, x)
  df_agg
}

df_user_type_prog <- prog_cons_or_both(df_progressive_users)
df_user_type_cons <- prog_cons_or_both(df_conservative_users)

Histogram: Progressives vs Multi-Type Users

hist(df_user_type_prog$x, breaks = seq(from=0, to=2, by=1), xlab="Progressives / Multi", 
     ylab = "Numer of users", plot = TRUE, labels= TRUE, main = "Progressives vs Multi",
     col = "blue1", ylim=c(0,950))

Histogram: Conservatives vs Multi-Type Users

hist(df_user_type_cons$x, breaks = seq(from=0, to=2, by=1), xlab="Conservatives / Multi", 
     ylab = "Numer of users", plot = TRUE, labels= TRUE, main = "Conservatives vs Multi", 
     col = "red1", ylim=c(0,650))

Prepare data for ANOVA

# Calculate the counts of the type of books read by a type of user
anova_format <- function(df_users, df_book_type, group_type) {
  df_group <- tibble("group" = "temp", "count" = 1)
   for(user in unique(df_users$user_id)) {
      user_temp <- df_users[df_users$user_id == user, ]
      num_books <- filter(user_temp, (user_temp$book_id %in% df_book_type$book_id))
      df_group <- df_group %>% add_row("group" = group_type, 
                                       "count" = length(num_books$book_id))
   }
  df_group[-1,]
}

df_pp <- anova_format(df_progressive_users, df_progressive_books, "prog_prog")
df_pc <- anova_format(df_progressive_users, df_conservative_books, "prog_cons")
df_cc <- anova_format(df_conservative_users, df_conservative_books, "cons_cons")
df_cp <- anova_format(df_conservative_users, df_progressive_books, "cons_prog")

df_anova <- bind_rows(df_pp,df_pc,df_cc,df_cp)
df_anova_ro <- bind_rows(df_pp,df_pc,df_cc,df_cp)

Summary statistics

# Summary statistics for each group
df_anova %>%
group_by(group) %>%
  summarise(
    n = n(),
    mean = mean(count),
    sd = sd(count),
    median = median(count),
    min = min(count),
    max = max(count)
  )
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 4 x 7
##   group         n  mean    sd median   min   max
##   <chr>     <int> <dbl> <dbl>  <dbl> <dbl> <dbl>
## 1 cons_cons  1000 1.30  0.748      1     1    11
## 2 cons_prog  1000 1.05  1.34       1     0    12
## 3 prog_cons  1000 0.448 0.880      0     0    12
## 4 prog_prog  1000 1.50  1.00       1     1    13

Boxplots code

user_reads_dist_plot_out <- ggboxplot(df_anova, x = "group", y = "count", 
          color = "group", palette = c("#00AFBB", "#E7B800", "#FC4E07", "#FC4E07"),
          order = c("prog_prog", "prog_cons", 
                    "cons_cons", "cons_prog"),
          ylab = "Count", xlab = "Group")

Boxplots

user_reads_dist_plot_out

Distribution plots code

# Plot each group's distribution
variability_groups_plot <- function(df, x) {
    p <- ggplot(data=df, aes(x=x, group=group, fill=group)) +
    geom_histogram(binwidth = 0.5) +
    theme_ipsum() +
    facet_wrap(~group) +
    theme(
      legend.position="none",
      panel.spacing = unit(0.1, "lines"),
      axis.ticks.x=element_blank()
    )
}

Distribution plots

plot_count_out <- variability_groups_plot(df_anova, df_anova$count)
plot_count_out

Transformations

transform_data <- function(df) {
  # Square root transformation
  df <- df %>% mutate(sqrt_count = df$count^0.5)
  
  # Log transformation (add 1 to correct for zeros)
  df <- df %>% mutate(log_count = log(df$count+1))
  df
}
df_anova <- transform_data(df_anova)

Distribution plots (log transformed)

plot_log <- variability_groups_plot(df_anova, df_anova$log_count)
plot_log

Choosing inference to perform

ANOVA, linear model

  • Passes independence assumption
  • Fails nearly normal assumption
  • Fails nearly constant variance assumption

Kruskal-Wallis / Mann Whitney U

  • Passes independence assumption
  • Passes ordinal / ratio / interval scale assumption
  • Potentially fails nearly same shape distribution assumption

Choosing inference to perform (continued)

Welch's ANOVA

  • Passes independence assumption
  • Fails nearly normal assumption

ANOVA, generalized linear model (Poisson)

  • Passes independence assumption
  • Passes follows Poisson distribution assumption
  • Passes non-negative integer assumption

ANOVA, generalized linear model (Poisson)

GLMs

  • Linear predictor
  • Link function
  • Error structure

Poisson

  • Ideal for count data
  • Mean and variance are the same

Chi-Square Test

  • Compares expected frequencies and the observed frequencies

Cramer's V

  • Equivalent to the correlation coefficient r
  • For df of 1: .1 signifies a small effect, .3 a medium effect, .5 a large effect
  • For df of 3: .06 signifies a small effect, .17 a medium effect, .29 a large effect

\(V=\sqrt{\frac{x^2}{n\cdot df}}\)

Kruskal-Wallis / Mann-Whitney U

  • Non parametric tests (no distribution assumption)
  • Use ranks of data, rather than actual values
  • Test if two samples are from the same population

Kruskal-Wallis Steps

  1. Sort the data in a combined set
  2. Assign ranks to the data points
  3. Add up the different ranks for each group
  4. Calculate the H statistic:

Mann Whitney GLM summary

Eta squared / R

Eta squared

  • Quantifies the percentage of variance explained by a predictor variable
  • Is analogous to R squared

R

  • .1 signifies a small effect, .3 a medium effect, .5 a large effect

\(r = \frac{Z}{√N}\)

Groups

df_anova_pp_pc <- filter(df_anova, (df_anova$group == "prog_prog")
                                     | (df_anova$group == "prog_cons"))
df_anova_pp_cc <- filter(df_anova, (df_anova$group == "prog_prog")
                                     | (df_anova$group == "cons_cons"))
df_anova_pp_cp <- filter(df_anova, (df_anova$group == "prog_prog")
                                     | (df_anova$group == "cons_prog"))
df_anova_pc_cc <- filter(df_anova, (df_anova$group == "prog_cons")
                                     | (df_anova$group == "cons_cons"))
df_anova_pc_cp <- filter(df_anova, (df_anova$group == "prog_cons")
                                     | (df_anova$group == "cons_prog"))
df_anova_cc_cp <- filter(df_anova, (df_anova$group == "cons_cons")
                                     | (df_anova$group == "cons_prog"))

Inference: GLM ANOVA (multi group)

glm_model <- glm(df_anova$count ~ as.factor(df_anova$group), poisson) 
anova(glm_model, test = "Chisq")
## Analysis of Deviance Table
## 
## Model: poisson, link: log
## 
## Response: df_anova$count
## 
## Terms added sequentially (first to last)
## 
## 
##                           Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                                       3999     4193.7              
## as.factor(df_anova$group)  3    665.5      3996     3528.2 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cramerV(df_anova$group, df_anova$count)
## Cramer V 
##    0.397

Post Hoc Tests: GLM ANOVA (1 of 6)

# pp vs pc
glm_model <- glm(df_anova_pp_pc$count ~ as.factor(df_anova_pp_pc$group), poisson) 
anova(glm_model, test = "Chisq")
## Analysis of Deviance Table
## 
## Model: poisson, link: log
## 
## Response: df_anova_pp_pc$count
## 
## Terms added sequentially (first to last)
## 
## 
##                                 Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                                             1999     2274.4              
## as.factor(df_anova_pp_pc$group)  1   602.17      1998     1672.3 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cramerV(df_anova_pp_pc$group, df_anova_pp_pc$count)
## Cramer V 
##   0.7328

Post Hoc Tests Summary: GLM ANOVA

ANOVA GLM summary

Inference: Kruskal-Wallis (multi group)

kruskal.test(count ~ group, data = df_anova) 
## 
##  Kruskal-Wallis rank sum test
## 
## data:  count by group
## Kruskal-Wallis chi-squared = 1086.2, df = 3, p-value < 2.2e-16
kruskal_effsize(count ~ group, data = df_anova) # eta squared
## # A tibble: 1 x 5
##   .y.       n effsize method  magnitude
## * <chr> <int>   <dbl> <chr>   <ord>    
## 1 count  4000   0.271 eta2[H] large

Post Hoc Tests: Mann-Whitney U (1 of 6)

# pp vs pc
wilcox.test(count ~ group, data = df_anova_pp_pc)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  count by group
## W = 149988, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
wilcox_effsize(count ~ group, data = df_anova_pp_pc)
## # A tibble: 1 x 7
##   .y.   group1    group2    effsize    n1    n2 magnitude
## * <chr> <chr>     <chr>       <dbl> <int> <int> <ord>    
## 1 count prog_cons prog_prog   0.652  1000  1000 large

Post Hoc Tests Summary: Mann-Whitney U

Mann Whitney GLM summary

Conclusions / Limitations

Limitations

  • The set of progressive books may be more popular.

  • Validity of Kruskal-Wallis and Mann-Whitney U Tests

Conclusions

  • Progressives and conservatives do not read books of opposing views as often as they read books of their own views.

Thank You

References (1 of 2)

M.J. Crawley (2005) "Statistics: An Introduction Using R"

Alboukadel Kassambara "One-Way ANOVA Test in R"

Alboukadel Kassambara "rstatix v0.6.0"

Stephanie Glen "Kruskal Wallis H Test: Definition, Examples & Assumptions"

Stephanie Glen "Mann Whitney U Test"

Nick Hendershot "GLM with zero-inflated data"

Mengting Wan, Julian McAuley, "Item Recommendation on Monotonic Behavior Chains", in RecSys'18. bibtex

Mengting Wan, Rishabh Misra, Ndapa Nakashole, Julian McAuley, "Fine-Grained Spoiler Detection from Large-Scale Review Corpora", in ACL'19. bibtex

References (2 of 2)

Appendix: Additional inference with outliers slides

Inference: GLM ANOVA Post Hoc Tests (2 of 6)

# pp vs cc
qp_glm_model <- glm(df_anova_pp_cc$count ~ as.factor(df_anova_pp_cc$group), poisson) 
anova(qp_glm_model, test = "Chisq")
## Analysis of Deviance Table
## 
## Model: poisson, link: log
## 
## Response: df_anova_pp_cc$count
## 
## Terms added sequentially (first to last)
## 
## 
##                                 Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                                             1999     770.18              
## as.factor(df_anova_pp_cc$group)  1   14.865      1998     755.31 0.0001155 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cramerV(df_anova_pp_cc$group, df_anova_pp_cc$count)
## Cramer V 
##   0.1442

Inference: GLM ANOVA Post Hoc Tests (3 of 6)

# pp vs cp
qp_glm_model <- glm(df_anova_pp_cp$count ~ as.factor(df_anova_pp_cp$group), poisson) 
anova(qp_glm_model, test = "Chisq")
## Analysis of Deviance Table
## 
## Model: poisson, link: log
## 
## Response: df_anova_pp_cp$count
## 
## Terms added sequentially (first to last)
## 
## 
##                                 Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                                             1999     2110.2              
## as.factor(df_anova_pp_cp$group)  1   80.807      1998     2029.4 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cramerV(df_anova_pp_cp$group, df_anova_pp_cp$count)
## Cramer V 
##   0.5372

Inference: GLM ANOVA Post Hoc Tests (4 of 6)

# pc vs cc
qp_glm_model <- glm(df_anova_pc_cc$count ~ as.factor(df_anova_pc_cc$group), poisson) 
anova(qp_glm_model, test = "Chisq")
## Analysis of Deviance Table
## 
## Model: poisson, link: log
## 
## Response: df_anova_pc_cc$count
## 
## Terms added sequentially (first to last)
## 
## 
##                                 Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                                             1999     1931.5              
## as.factor(df_anova_pc_cc$group)  1   432.72      1998     1498.8 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cramerV(df_anova_pc_cc$group, df_anova_pc_cc$count)
## Cramer V 
##   0.7361

Inference: GLM ANOVA Post Hoc Tests (5 of 6)

# pc vs cp
qp_glm_model <- glm(df_anova_pc_cp$count ~ as.factor(df_anova_pc_cp$group), poisson) 
anova(qp_glm_model, test = "Chisq")
## Analysis of Deviance Table
## 
## Model: poisson, link: log
## 
## Response: df_anova_pc_cp$count
## 
## Terms added sequentially (first to last)
## 
## 
##                                 Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                                             1999     3021.8              
## as.factor(df_anova_pc_cp$group)  1    248.9      1998     2772.9 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cramerV(df_anova_pc_cp$group, df_anova_pc_cp$count)
## Cramer V 
##   0.2934

Inference: GLM ANOVA Post Hoc Tests (6 of 6)

# cc vs cp
qp_glm_model <- glm(df_anova_cc_cp$count ~ as.factor(df_anova_cc_cp$group), poisson) 
anova(qp_glm_model, test = "Chisq")
## Analysis of Deviance Table
## 
## Model: poisson, link: log
## 
## Response: df_anova_cc_cp$count
## 
## Terms added sequentially (first to last)
## 
## 
##                                 Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                                             1999     1882.4              
## as.factor(df_anova_cc_cp$group)  1   26.444      1998     1855.9 2.712e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cramerV(df_anova_cc_cp$group, df_anova_cc_cp$count)
## Cramer V 
##   0.5685

Inference: Mann-Whitney Post Hoc Tests (2 of 6)

# Mann-Whitney: pp vs cc
wilcox.test(count ~ group, data = df_anova_pp_cc)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  count by group
## W = 444247, p-value = 1.675e-08
## alternative hypothesis: true location shift is not equal to 0
wilcox_effsize(count ~ group, data = df_anova_pp_cc) # r ( = Z/(√Nobs))
## # A tibble: 1 x 7
##   .y.   group1    group2    effsize    n1    n2 magnitude
## * <chr> <chr>     <chr>       <dbl> <int> <int> <ord>    
## 1 count cons_cons prog_prog   0.126  1000  1000 small

Inference: Mann-Whitney Post Hoc Tests (3 of 6)

# Mann-Whitney: pp vs cp
wilcox.test(count ~ group, data = df_anova_pp_cp)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  count by group
## W = 328846, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
wilcox_effsize(count ~ group, data = df_anova_pp_cp) # r ( = Z/(√Nobs))
## # A tibble: 1 x 7
##   .y.   group1    group2    effsize    n1    n2 magnitude
## * <chr> <chr>     <chr>       <dbl> <int> <int> <ord>    
## 1 count cons_prog prog_prog   0.321  1000  1000 moderate

Inference: Mann-Whitney Post Hoc Tests (4 of 6)

# Mann-Whitney: pc vs cc
wilcox.test(count ~ group, data = df_anova_pc_cc)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  count by group
## W = 832812, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
wilcox_effsize(count ~ group, data = df_anova_pc_cc) # r ( = Z/(√Nobs))
## # A tibble: 1 x 7
##   .y.   group1    group2    effsize    n1    n2 magnitude
## * <chr> <chr>     <chr>       <dbl> <int> <int> <ord>    
## 1 count cons_cons prog_cons   0.633  1000  1000 large

Inference: Mann-Whitney Post Hoc Tests (5 of 6)

# Mann-Whitney: pc vs cp
wilcox.test(count ~ group, data = df_anova_pc_cp)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  count by group
## W = 650232, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
wilcox_effsize(count ~ group, data = df_anova_pc_cp) # r ( = Z/(√Nobs))
## # A tibble: 1 x 7
##   .y.   group1    group2    effsize    n1    n2 magnitude
## * <chr> <chr>     <chr>       <dbl> <int> <int> <ord>    
## 1 count cons_prog prog_cons   0.290  1000  1000 small

Inference: Mann-Whitney Post Hoc Tests (6 of 6)

# Mann-Whitney: cc vs cp
wilcox.test(count ~ group, data = df_anova_cc_cp)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  count by group
## W = 639742, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
wilcox_effsize(count ~ group, data = df_anova_cc_cp) # r ( = Z/(√Nobs))
## # A tibble: 1 x 7
##   .y.   group1    group2    effsize    n1    n2 magnitude
## * <chr> <chr>     <chr>       <dbl> <int> <int> <ord>    
## 1 count cons_cons cons_prog   0.269  1000  1000 small

Appendix: EDA and Inference without outliers

Remove outliers (1 of 5)

Q <- quantile(df_anova_ro$count, probs=c(.25, .75), na.rm = FALSE)
iqr <- IQR(df_anova_ro$count)
df_anova_ro <- subset(df_anova_ro, df_anova_ro$count > (Q[1] - 1.5*iqr) & df_anova_ro$count < (Q[2]+1.5*iqr))

Summary statistics (2 of 5)

# Summary statistics for each group
df_anova_ro %>%
group_by(group) %>%
  summarise(
    n = n(),
    mean = mean(count),
    sd = sd(count),
    min = min(count),
    max = max(count)
  )
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 4 x 6
##   group         n  mean    sd   min   max
##   <chr>     <int> <dbl> <dbl> <dbl> <dbl>
## 1 cons_cons   944 1.16  0.365     1     2
## 2 cons_prog   888 0.681 0.730     0     2
## 3 prog_cons   973 0.351 0.603     0     2
## 4 prog_prog   883 1.22  0.414     1     2

Boxplots code (no outliers) (3 of 5)

user_reads_dist_plot <- ggboxplot(df_anova_ro, x = "group", y = "count", 
          color = "group", palette = c("#00AFBB", "#E7B800", "#FC4E07", "#FC4E07"),
          order = c("prog_prog", "prog_cons", 
                    "cons_cons", "cons_prog"),
          ylab = "Count", xlab = "Group")

Boxplots (no outliers) (4 of 5)

user_reads_dist_plot

Distribution plots (without outliers) (5 of 5)

plot_count <- variability_groups_plot(df_anova_ro, df_anova_ro$count)
plot_count

Transformations

transform_data <- function(df) {
  # Square root transformation
  df <- df %>% mutate(sqrt_count = df$count^0.5)
  
  # Log transformation (add 1 to correct for zeros)
  df <- df %>% mutate(log_count = log(df$count+1))
  df
}
df_anova_ro <- transform_data(df_anova_ro)

Summary statistics (log count)

# Summary statistics for each group
df_anova_ro %>%
group_by(group) %>%
  summarise(
    n = n(),
    mean = mean(log_count),
    sd = sd(log_count),
    min = min(log_count),
    max = max(log_count)
  )
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 4 x 6
##   group         n  mean    sd   min   max
##   <chr>     <int> <dbl> <dbl> <dbl> <dbl>
## 1 cons_cons   944 0.757 0.148 0.693  1.10
## 2 cons_prog   888 0.427 0.429 0      1.10
## 3 prog_cons   973 0.224 0.368 0      1.10
## 4 prog_prog   883 0.782 0.168 0.693  1.10

Distribution plots (log transformed)

plot_log <- variability_groups_plot(df_anova_ro, df_anova_ro$log_count)
plot_log

Groups without outliers

df_anova_pp_pc_ro <- filter(df_anova_ro, (df_anova_ro$group == "prog_prog")
                                     | (df_anova_ro$group == "prog_cons"))
df_anova_pp_cc_ro <- filter(df_anova_ro, (df_anova_ro$group == "prog_prog")
                                     | (df_anova_ro$group == "cons_cons"))
df_anova_pp_cp_ro <- filter(df_anova_ro, (df_anova_ro$group == "prog_prog")
                                     | (df_anova_ro$group == "cons_prog"))
df_anova_pc_cc_ro <- filter(df_anova_ro, (df_anova_ro$group == "prog_cons")
                                     | (df_anova_ro$group == "cons_cons"))
df_anova_pc_cp_ro <- filter(df_anova_ro, (df_anova_ro$group == "prog_cons")
                                     | (df_anova_ro$group == "cons_prog"))
df_anova_cc_cp_ro <- filter(df_anova_ro, (df_anova_ro$group == "cons_cons")
                                     | (df_anova_ro$group == "cons_prog"))

Assumption: Variability across the groups is about equal tests

# Bartlett: Parametric test of the equality of variances
bartlett.test(count ~ group, data=df_anova_ro)
## 
##  Bartlett test of homogeneity of variances
## 
## data:  count by group
## Bartlett's K-squared = 556.71, df = 3, p-value < 2.2e-16
# Fligner-Killeen: Non-parametric test of the equality of variances
fligner.test(count ~ group, data=df_anova_ro)
## 
##  Fligner-Killeen test of homogeneity of variances
## 
## data:  count by group
## Fligner-Killeen:med chi-squared = 435.4, df = 3, p-value < 2.2e-16

Inference: GLM ANOVA (comparing all groups)

# GLM ANOVA
qp_glm_model <- glm(df_anova_ro$count ~ as.factor(df_anova_ro$group), quasipoisson) 
anova(qp_glm_model, test = "Chisq")
## Analysis of Deviance Table
## 
## Model: quasipoisson, link: log
## 
## Response: df_anova_ro$count
## 
## Terms added sequentially (first to last)
## 
## 
##                              Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                                          3687     2569.7              
## as.factor(df_anova_ro$group)  3   616.58      3684     1953.1 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cramerV(df_anova_ro$group, df_anova_ro$count)
## Cramer V 
##   0.4867

Inference: GLM ANOVA Post Hoc Tests (1 of 6)

# pp vs pc
qp_glm_model <- glm(df_anova_pp_pc_ro$count ~ as.factor(df_anova_pp_pc_ro$group), quasipoisson) 
anova(qp_glm_model, test = "Chisq")
## Analysis of Deviance Table
## 
## Model: quasipoisson, link: log
## 
## Response: df_anova_pp_pc_ro$count
## 
## Terms added sequentially (first to last)
## 
## 
##                                    Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                                                1855     1481.5              
## as.factor(df_anova_pp_pc_ro$group)  1   473.62      1854     1007.9 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cramerV(df_anova_pp_pc_ro$group, df_anova_pp_pc_ro$count)
## Cramer V 
##   0.7389

Inference: GLM ANOVA Post Hoc Tests (2 of 6)

# pp vs cc
qp_glm_model <- glm(df_anova_pp_cc_ro$count ~ as.factor(df_anova_pp_cc_ro$group), quasipoisson) 
anova(qp_glm_model, test = "Chisq")
## Analysis of Deviance Table
## 
## Model: quasipoisson, link: log
## 
## Response: df_anova_pp_cc_ro$count
## 
## Terms added sequentially (first to last)
## 
## 
##                                    Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                                                1826     203.86              
## as.factor(df_anova_pp_cc_ro$group)  1   1.4169      1825     202.45 0.0008453 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cramerV(df_anova_pp_cc_ro$group, df_anova_pp_cc_ro$count)
## Cramer V 
##  0.07781

Inference: GLM ANOVA Post Hoc Tests (3 of 6)

# pp vs cp
qp_glm_model <- glm(df_anova_pp_cp_ro$count ~ as.factor(df_anova_pp_cp_ro$group), quasipoisson) 
anova(qp_glm_model, test = "Chisq")
## Analysis of Deviance Table
## 
## Model: quasipoisson, link: log
## 
## Response: df_anova_pp_cp_ro$count
## 
## Terms added sequentially (first to last)
## 
## 
##                                    Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                                                1770    1098.62              
## as.factor(df_anova_pp_cp_ro$group)  1   136.43      1769     962.19 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cramerV(df_anova_pp_cp_ro$group, df_anova_pp_cp_ro$count)
## Cramer V 
##   0.5637

Inference: GLM ANOVA Post Hoc Tests (4 of 6)

# pc vs cc
qp_glm_model <- glm(df_anova_pc_cc_ro$count ~ as.factor(df_anova_pc_cc_ro$group), quasipoisson) 
anova(qp_glm_model, test = "Chisq")
## Analysis of Deviance Table
## 
## Model: quasipoisson, link: log
## 
## Response: df_anova_pc_cc_ro$count
## 
## Terms added sequentially (first to last)
## 
## 
##                                    Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                                                1916    1427.25              
## as.factor(df_anova_pc_cc_ro$group)  1   436.34      1915     990.91 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cramerV(df_anova_pc_cc_ro$group, df_anova_pc_cc_ro$count)
## Cramer V 
##   0.7469

Inference: GLM ANOVA Post Hoc Tests (5 of 6)

# pc vs cp
qp_glm_model <- glm(df_anova_pc_cp_ro$count ~ as.factor(df_anova_pc_cp_ro$group), quasipoisson) 
anova(qp_glm_model, test = "Chisq")
## Analysis of Deviance Table
## 
## Model: quasipoisson, link: log
## 
## Response: df_anova_pc_cp_ro$count
## 
## Terms added sequentially (first to last)
## 
## 
##                                    Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                                                1860     1850.7              
## as.factor(df_anova_pc_cp_ro$group)  1   100.03      1859     1750.7 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cramerV(df_anova_pc_cp_ro$group, df_anova_pc_cp_ro$count)
## Cramer V 
##   0.2483

Inference: GLM ANOVA Post Hoc Tests (6 of 6)

# cc vs cp
qp_glm_model <- glm(df_anova_cc_cp_ro$count ~ as.factor(df_anova_cc_cp_ro$group), quasipoisson) 
anova(qp_glm_model, test = "Chisq")
## Analysis of Deviance Table
## 
## Model: quasipoisson, link: log
## 
## Response: df_anova_cc_cp_ro$count
## 
## Terms added sequentially (first to last)
## 
## 
##                                    Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                                                1831    1059.23              
## as.factor(df_anova_cc_cp_ro$group)  1   113.99      1830     945.24 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cramerV(df_anova_cc_cp_ro$group, df_anova_cc_cp_ro$count)
## Cramer V 
##   0.5815

Inference: Kruskal-Wallis (comparing all groups)

# Kruskal-Wallis (test for non-normal distributions)
kruskal.test(count ~ group, data = df_anova) 
## 
##  Kruskal-Wallis rank sum test
## 
## data:  count by group
## Kruskal-Wallis chi-squared = 1086.2, df = 3, p-value < 2.2e-16
kruskal_effsize(count ~ group, data = df_anova) # eta squared
## # A tibble: 1 x 5
##   .y.       n effsize method  magnitude
## * <chr> <int>   <dbl> <chr>   <ord>    
## 1 count  4000   0.271 eta2[H] large

Inference: Mann-Whitney Post Hoc Tests (1 of 6)

# Mann-Whitney: pp vs pc
wilcox.test(count ~ group, data = df_anova_pp_pc_ro)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  count by group
## W = 124359, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
wilcox_effsize(count ~ group, data = df_anova_pp_pc_ro) # r ( = Z/(√Nobs))
## # A tibble: 1 x 7
##   .y.   group1    group2    effsize    n1    n2 magnitude
## * <chr> <chr>     <chr>       <dbl> <int> <int> <ord>    
## 1 count prog_cons prog_prog   0.674   973   883 large

Inference: Mann-Whitney Post Hoc Tests (2 of 6)

# Mann-Whitney: pp vs cc
wilcox.test(count ~ group, data = df_anova_pp_cc_ro)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  count by group
## W = 391464, p-value = 0.0008849
## alternative hypothesis: true location shift is not equal to 0
wilcox_effsize(count ~ group, data = df_anova_pp_cc_ro) # r ( = Z/(√Nobs))
## # A tibble: 1 x 7
##   .y.   group1    group2    effsize    n1    n2 magnitude
## * <chr> <chr>     <chr>       <dbl> <int> <int> <ord>    
## 1 count cons_cons prog_prog  0.0778   944   883 small

Inference: Mann-Whitney Post Hoc Tests (3 of 6)

# Mann-Whitney: pp vs cp
wilcox.test(count ~ group, data = df_anova_pp_cp_ro)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  count by group
## W = 222235, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
wilcox_effsize(count ~ group, data = df_anova_pp_cp_ro) # r ( = Z/(√Nobs))
## # A tibble: 1 x 7
##   .y.   group1    group2    effsize    n1    n2 magnitude
## * <chr> <chr>     <chr>       <dbl> <int> <int> <ord>    
## 1 count cons_prog prog_prog   0.422   888   883 moderate

Inference: Mann-Whitney Post Hoc Tests (4 of 6)

# Mann-Whitney: pc vs cc
wilcox.test(count ~ group, data = df_anova_pc_cc_ro)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  count by group
## W = 777650, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
wilcox_effsize(count ~ group, data = df_anova_pc_cc_ro) # r ( = Z/(√Nobs))
## # A tibble: 1 x 7
##   .y.   group1    group2    effsize    n1    n2 magnitude
## * <chr> <chr>     <chr>       <dbl> <int> <int> <ord>    
## 1 count cons_cons prog_cons   0.668   944   973 large

Inference: Mann-Whitney Post Hoc Tests (5 of 6)

# Mann-Whitney: pc vs cp
wilcox.test(count ~ group, data = df_anova_pc_cp_ro)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  count by group
## W = 539666, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
wilcox_effsize(count ~ group, data = df_anova_pc_cp_ro) # r ( = Z/(√Nobs))
## # A tibble: 1 x 7
##   .y.   group1    group2    effsize    n1    n2 magnitude
## * <chr> <chr>     <chr>       <dbl> <int> <int> <ord>    
## 1 count cons_prog prog_cons   0.248   888   973 small

Inference: Mann-Whitney Post Hoc Tests (6 of 6)

# Mann-Whitney: cc vs cp
wilcox.test(count ~ group, data = df_anova_cc_cp_ro)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  count by group
## W = 587354, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
wilcox_effsize(count ~ group, data = df_anova_cc_cp_ro) # r ( = Z/(√Nobs))
## # A tibble: 1 x 7
##   .y.   group1    group2    effsize    n1    n2 magnitude
## * <chr> <chr>     <chr>       <dbl> <int> <int> <ord>    
## 1 count cons_cons cons_prog   0.400   944   888 moderate

Appendix: miscellaneous

Inference: LM ANOVA (comparing all groups)

# ANOVA
anova_model <- aov(count ~ group, data = df_anova)
summary(anova_model)
##               Df Sum Sq Mean Sq F value Pr(>F)    
## group          3    627  209.04   201.5 <2e-16 ***
## Residuals   3996   4146    1.04                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Top progressive books by progressive users

# To view actual book https://www.goodreads.com/book/show/[insert book_id]
prog_books_by_prog_users <- filter(df_progressive_users, df_progressive_users$book_id %in% df_progressive_books$book_id)
head(prog_books_by_prog_users %>% count(book_id, sort = TRUE))
## # A tibble: 6 x 2
##   book_id     n
##     <dbl> <int>
## 1     706   454
## 2     475   141
## 3   16280   115
## 4    6956    78
## 5   16195    73
## 6   12612    47

Top conservative books by conservative users

# To view actual book https://www.goodreads.com/book/show/[insert book_id]
cons_books_by_cons_users <- filter(df_conservative_users, df_conservative_users$book_id %in% df_conservative_books$book_id)
cons_books_by_cons_users %>% count(book_id, sort = TRUE)
## # A tibble: 123 x 2
##    book_id     n
##      <dbl> <int>
##  1    5238   405
##  2   25082   166
##  3    5240    73
##  4    3871    49
##  5    3044    42
##  6   48625    42
##  7    5665    38
##  8   11138    35
##  9   25772    34
## 10   37663    27
## # … with 113 more rows

Distribution plots (square root transformed)

plot_sqrt <- variability_groups_plot(df_anova, df_anova$sqrt_count)
plot_sqrt

Checking nearly constant variance assumption

# Bartlett: Parametric test of the equality of variances
bartlett.test(log_count ~ group, data=df_anova_pp_cc)
## 
##  Bartlett test of homogeneity of variances
## 
## data:  log_count by group
## Bartlett's K-squared = 51.705, df = 1, p-value = 6.45e-13
# Fligner-Killeen: Non-parametric test of the equality of variances
fligner.test(log_count ~ group, data=df_anova_pp_cc)
## 
##  Fligner-Killeen test of homogeneity of variances
## 
## data:  log_count by group
## Fligner-Killeen:med chi-squared = 32.736, df = 1, p-value = 1.055e-08

Venn diagram code

# Filter for the progressive and conservative books of progressive users
df_progressive_users_political_books <- df_progressive_users[
  (df_progressive_users$book_id %in% df_prog_and_cons_books$book_id), ]

# Filter for the progressive and conservative books of conservative users
df_conservative_users_political_books <- df_conservative_users[
  (df_conservative_users$book_id %in% df_prog_and_cons_books$book_id), ]
  
# Convert to venn diagram format
prog_only <- paste(df_progressive_users_political_books$book_id, sep="")
cons_only <- paste(df_conservative_users_political_books$book_id, sep="")

# Chart
venn.diagram(
  x = list(prog_only, cons_only),
  category.names = c("Progressive" , "Conservative"),
  filename = '#14_venn_diagramm.png',
  output=TRUE
)
## [1] 1

Venn diagram

Venn books